org 100h   ; .COM-style origin; assume ax=bx=0 ch=0 on entry

  pop di   ; di=sp=0 (relies on a zero word on top of the entry stack at sp=0xfffe - TODO confirm for the target DOS)

;Prepare floating-point constants for SSE
;Each outer pass pushes the dword (ax<<16 | bx), bx=0, four times, so every
;16-byte slot in 0x8000..0xffff holds one packed-single vector whose upper
;16 bits are the current value of ax.
;[0xfff0]=0xffe00000, [0xffe0]=0xffc00000, ... [0x8000]=0
;step 0x00200000: ... 1 1.25 1.5 1.75  2 2.5 3 3.5  4 5 6 7 ...
PK:
  mov cl,4     ; cx=4 because ch=0: four dwords per slot
  sub ax,0x20  ; next upper word; the ZF set here is what "jnz PK" tests
PKL:
  push ax  ; x
  push bx  ; 0
  loop PKL ; store four times (push and loop leave the flags alone)
  jnz PK   ; loop 2048 times -> 32kB; sp=0x8000


  mov al,13h   ; ah is still 0 from the loop above, so ax=0x0013: BIOS set video mode 13h
  int 10h
  push 0xa000
  pop es       ; es=0xa000: target segment of the pixel writes; later also copied into bx as a plain ds offset
  fninit       ; reset the x87 FPU before the frame loop uses its register stack

;;palette test
;SEGMENTS equ 8
;XRUN     equ 6
;YRUN     equ 6
;START    equ 0
;
;%if YRUN<256
;  %define mov__cx_YRUN mov cl,YRUN
;%else
;  %define mov__cx_YRUN mov cx,YRUN
;%endif
;
;%if XRUN<256
;  %define mov__cx_XRUN mov cl,XRUN
;%else
;  %define mov__cx_XRUN mov cx,XRUN
;%endif
;
;  pusha
;  salc
;  scasw
;PW mov__cx_YRUN
;PY mov bx,SEGMENTS
;  pusha
;  add al,START
;PX mov__cx_XRUN
;  rep stosb
;  inc ax
;  dec bx
;  jnz PX
;  popa
;  add di,320
;  loop PY
;%if SEGMENTS<256
;  add al,SEGMENTS
;  jnc PW
;%endif
;  popa

;Palette: Luminance * Hue: diffuse = L*[0.2,H,1], specular = L^9 / 2
;Writes 256 RGB triplets to the DAC, starting at index 0.
;Entry: bx=0. Colour index = LLLLHHHH (bh upper nibble = L, bl upper nibble = H).
;The two 3-pass loops are counted through the parity flag of inc/dec si, so
;they rely on the low byte of si being 0 here. si is never written in this
;file - presumably the DOS entry value 0x100, TODO confirm.
  mov dx,3c8h
  xor ax,ax
  out dx,al  ; start writing at palette index 0
  inc dx     ; dx=0x3c9: palette data port
PAL:
  or bx,0b0000111100001111  ; bx = LLLL.... HHHH.... (low nibbles forced to 1111)

;Color
;Pushed in reverse order so that MAD pops r, then g, then b.
  push dx    ; b=0.78 (low byte of dx=0x3c9 is 0xc9 = 201/256)
  push bx    ; g=H (low byte bl)
  push ax    ; r=last blue output (0..0.25)

;Specular
  mov al,bh  ; al=L as an 8-bit fraction
POW:
  mul al     ; ax=al*al
  mov al,ah  ; keep the high byte: al=al^2/256
  inc si     ; low byte of si 1,2,3: parity odd, odd, even
  jpo POW    ; 3 times
  shr ah,1   ; cl=L^8/2 (0..127)
  mov cl,ah

;Diffuse, add with saturation
MAD:
  pop ax     ; rgb
  add al,cl  ; al=L^8/2 + rgb
  jnc SAT
  salc       ; clamp to 0..255 (carry is set here, so al=0xff)
SAT:
  mul bh     ; ah=L*clamp(L^8/2 + rgb)
  shr ax,10  ; al=top 6 bits of the 16-bit product
  out dx,al
  dec si     ; low byte of si 2,1,0: parity odd, odd, even
  jpo MAD    ; 3 times

  inc bx     ; carry ripples through the forced-1 nibbles: H+1, then L+1 when H wraps
  jnz PAL    ; 256 passes; bx=0 again on exit

;;palette test
;  xor ax,ax  ; wait for a key
;  int 16h
;  mov ax,3   ; textmode
;  int 10h
;  ret

;K(x): memory operand for a packed-single constant.
;x is the upper 16 bits of the IEEE-754 single wanted. The table built at
;program start keeps, at 0x8000 + x/2, four copies of the dword x<<16, one
;16-byte slot for every 0x20 step of x, so x should be a multiple of 0x20
;(other values truncate down to the previous slot).
;The argument is parenthesized so that an expression can be passed safely.
%define K(x) [0x8000 + 0x10*((x)/0x20)]
%define K_0_25        K(0x3e80)  ; 0.25
%define K_TIME_DELTA  K(0x3c00)  ; 0.0078125
%define K_EPS         K(0x3ca0)  ; 0.01953125 = 20/1024
%define K_LIGHT_SCALE K(0x4440)  ; 768 = 15/EPS
%define K_HUE_SCALE   K(0x41e0)  ; 28 = 16 steps * 1.75
%define K_NEG_ABS     K(0x8000)  ; -0 = 0x80000000 for -abs()
%define K_MINUS1      K(0xbf80)  ; -1
%define K_1           K(0x3f80)  ; 1

;For 16:9 screens: pixel aspect ratio = 1.03
%define K_X_SCALE     K(0x3020)  ; 2.5 * 2**-32: x -> ..1.25
%define K_Y_SCALE     K(0x2fe0)  ; 1.75 * 2**-32: y -> ..0.6836

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE     K(0x3000)  ; 2.0 * 2**-32: x -> ..1.0
;%define K_Y_SCALE     K(0x3000)  ; 2.0 * 2**-32: y -> ..0.7813

  fldz             ;| t=0

;For each frame: prepare rotation constants
;Stack pictures after "|" list st0 first. Three angles are derived from the
;same t (t, ln2*t, log2(e)*t); t itself stays on the FPU stack between frames.
M fadd dword K_TIME_DELTA ;| t+=dt
  fld st0
  fsincos          ;| C1 S1 t
  fldln2
  fmul st3         ;| 0.69314718*t C1 S1 t
  fsincos          ;| C2 S2 C1 S1 t
  fldl2e
  fmul st5         ;| 1.44269504*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t

;Store each constant four times
;bx is an offset into ds here (not into the video segment); each 16-byte slot
;becomes one packed-single vector for the SSE code.
  mov bx,0xa020
STORE:
  mov cl,4         ; cx=4, relies on ch=0
STORE4:
  fst dword[bx]    ;0xa000 10 20 30 40 50 60 70 80
  add bl,4         ;    XY    C3 S3 C2 S2 C1 S1 scratch
  loop STORE4      ; loop and fstp keep the flags of the last "add bl,4"
  fstp st0         ; pop the value just stored
  jns STORE        ; loop 6 times (SF is first set when bl reaches 0x80): bx=0xa080, only t left on the FPU stack

%define COS [bx]
%define SIN [bx+0x10]

;For each pixel: store x,y coordinates
;One pass of X handles four horizontally adjacent pixels, one per SSE lane.
;For each of them dx:ax = 0xcccd*di + 0x9b800000 is stored as a dword at
;ds:0xa000+4*lane; di is the pixel offset in the video segment.
X mov bx,es    ; bx=0xa000, used as a ds offset
  mov cl,4     ; cx=4, relies on ch=0
X4:
  mov ax,0xcccd
  mul di
  add dx,0x9b80
  mov [bx],ax
  mov [bx+2],dx
  add bl,4
  inc di
  loop X4      ; di+=4 bx=0xa010

;INT_X reads the same dwords one byte lower, i.e. shifted left by 8 bits with
;an unrelated low byte, which is why MAP has to load it with movups.
%define INT_X [bx-1]  ; x = 2^32 * (-0.5..0.5)
%define INT_Y [bx]    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

%define x xmm0 ; XYZ coordinates for iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [c,c/4,0]
%define d xmm7 ; depth

;Trace steps along a ray
;MAP returns with ch=0 and leaves cl alone, so cx keeps counting the 16 steps.
  mov cl,16
  movaps d,K_MINUS1; d=-1
  xorps a,a
T addps d,a        ; d+=map(X,Y,d)
  call MAP
  loop T

;Normal, ambient occlusion
  movaps [bx],a    ; bx=0xa080
  subps d,K_EPS
  call MAP         ; a = map(X,Y,d-EPS)
  subps a,[bx]     ; a = map(X,Y,d-EPS) - map(X,Y,d)

;Depth fog
  movaps b,K_1
  subps b,d
  minps b,K_1
  mulps a,b        ; a *= min(1-d,1)

;Color
  mulps a,K_LIGHT_SCALE
  mulps o,K_HUE_SCALE
  cvtps2dq a,a
  cvtps2dq o,o
  pslld a,4
  paddd a,o        ; color index = (L<<4) + H
  packssdw a,a     ; dwords -> words with signed saturation
  packuswb a,a     ; clamp to 0..255

;Next pixel
  movd [es:di-4],a ; write the four pixel bytes; di already points past them
  test di,di
  jnz X            ; until di wraps to 0: the whole 64kB segment is written

;Esc test, next frame
  in al,0x60       ; read the keyboard data port
  dec al           ; zero only when the byte read was 1 (Esc)
  jnz M   ; fallthrough
;On Esc, execution falls into MAP and its final ret pops the word at sp.
;NOTE(review): with sp=0x8000 and the constant table holding 0 there, that
;returns to offset 0 of the segment - presumably the PSP's INT 20h exit stub;
;confirm. The video mode is not restored.

;MAP: distance estimate for four pixels at once (one per SSE lane).
;In:    d = depth along the ray; the four coordinate dwords at ds:0xa000;
;       rotation vectors C3 S3 C2 S2 C1 S1 at ds:0xa020..0xa07f
;Out:   a = estimated distance, o = orbit trap (largest squared length seen)
;Clobb: x y z b c, bx (=0xa080 on return), ch (=0 on return), flags
;Keeps: d, cl
MAP:
  mov bx,es
  movups x,INT_X    ; unaligned on purpose: INT_X is [bx-1]
  cvtdq2ps y,INT_Y
  cvtdq2ps x,x
  mulps x,K_X_SCALE ; x: -1.25..1.25 with the 16:9 scale (-1..1 with the 4:3 one)
  mulps y,K_Y_SCALE
  movaps z,d    ; x,y,z = X,Y,depth

  xorps o,o    ; o=0
  movaps c,K_0_25 ; c=K: translation=[c,c/4,0]
  mov ch,18    ; number of iterations

;Rotate in the XZ, YX and ZY planes
;One rotation per pass of R; the register cycle at the end of each pass makes
;the same code act on a different coordinate pair next time.
L mov bl,0x20
R movaps b,COS ; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,SIN
  mulps b,z    ; b=Cz
  mulps z,a    ; z=Sz
  mulps a,x    ; a=Sx
  mulps x,COS  ; x=Cx
  subps a,b    ; a=x'=Sx-Cz
  addps z,x    ; z=z'=Sz+Cx
  movaps x,y   ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20  ; bl used per pass: 0x20 | 0x40 | 0x60; becomes 0x80 after the third
  jns R        ; bx=0xa080

;Reflect along x and y
  orps x,K_NEG_ABS  ; x=-abs(x)
  orps y,K_NEG_ABS  ; y=-abs(y)

;Translate
  movaps a,c
  mulps a,K_0_25 ; a=K*c
  addps x,c    ; x+=c
  addps y,a    ; y+=K*c

  subps c,a    ; c*=1-K: scale translation

;Orbit trap = squared distance to [0,0,0]
  movaps a,x
  movaps b,y
  mulps a,a    ; a=x*x
  mulps b,b    ; b=y*y
  addps b,a    ; b=x*x+y*y
  movaps a,z
  mulps a,a    ; a=z*z
  addps b,a    ; b=length^2=x*x+y*y+z*z
  maxps o,b    ; o=max(o,length^2)

;Next iteration
  dec ch
  jnz L        ; leaves ch=0, which the callers' "mov cl,N" / loop pairs rely on

;Distance to a little sphere
;b still holds length^2 from the last iteration.
  rsqrtps a,b  ; a=(length^2)^(-1/2)
  mulps a,b    ; a=(length^2)^(-1/2 + 1) = length

  subps a,c
  subps a,c    ; a=length-2c: radius = 2*final translation
  ret          ; bx=0xa080
